Review the structure and content of the data and answer questions such as: Are the features (columns) of your data correlated? What is the overall distribution of each variable? Are there any outliers present? What are the relationships between different variables? How are categorical variables distributed? Do any patterns or trends emerge in the data? What is the central tendency and spread of each variable? Are there any missing values and how significant are they?
# Load Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(tidyr)
library(corrplot)
## corrplot 0.95 loaded
library(ggpubr)
library(naniar) # for missing value visualization
library(DataExplorer) # optional: automated EDA
# Load Dataset
url <- "https://raw.githubusercontent.com/uzmabb182/Data_622/refs/heads/main/Assignment_1_EDA/bank-additional-full.csv"
bank_additional_df <- read.csv2(url, stringsAsFactors = FALSE)
head(bank_additional_df)
## age job marital education default housing loan contact month
## 1 56 housemaid married basic.4y no no no telephone may
## 2 57 services married high.school unknown no no telephone may
## 3 37 services married high.school no yes no telephone may
## 4 40 admin. married basic.6y no no no telephone may
## 5 56 services married high.school no no yes telephone may
## 6 45 services married basic.9y unknown no no telephone may
## day_of_week duration campaign pdays previous poutcome emp.var.rate
## 1 mon 261 1 999 0 nonexistent 1.1
## 2 mon 149 1 999 0 nonexistent 1.1
## 3 mon 226 1 999 0 nonexistent 1.1
## 4 mon 151 1 999 0 nonexistent 1.1
## 5 mon 307 1 999 0 nonexistent 1.1
## 6 mon 198 1 999 0 nonexistent 1.1
## cons.price.idx cons.conf.idx euribor3m nr.employed y
## 1 93.994 -36.4 4.857 5191 no
## 2 93.994 -36.4 4.857 5191 no
## 3 93.994 -36.4 4.857 5191 no
## 4 93.994 -36.4 4.857 5191 no
## 5 93.994 -36.4 4.857 5191 no
## 6 93.994 -36.4 4.857 5191 no
# Basic structure
str(bank_additional_df)
## 'data.frame': 41188 obs. of 21 variables:
## $ age : int 56 57 37 40 56 45 59 41 24 25 ...
## $ job : chr "housemaid" "services" "services" "admin." ...
## $ marital : chr "married" "married" "married" "married" ...
## $ education : chr "basic.4y" "high.school" "high.school" "basic.6y" ...
## $ default : chr "no" "unknown" "no" "no" ...
## $ housing : chr "no" "no" "yes" "no" ...
## $ loan : chr "no" "no" "no" "no" ...
## $ contact : chr "telephone" "telephone" "telephone" "telephone" ...
## $ month : chr "may" "may" "may" "may" ...
## $ day_of_week : chr "mon" "mon" "mon" "mon" ...
## $ duration : int 261 149 226 151 307 198 139 217 380 50 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : chr "1.1" "1.1" "1.1" "1.1" ...
## $ cons.price.idx: chr "93.994" "93.994" "93.994" "93.994" ...
## $ cons.conf.idx : chr "-36.4" "-36.4" "-36.4" "-36.4" ...
## $ euribor3m : chr "4.857" "4.857" "4.857" "4.857" ...
## $ nr.employed : chr "5191" "5191" "5191" "5191" ...
## $ y : chr "no" "no" "no" "no" ...
# Dimensions
dim(bank_additional_df) # rows, columns
## [1] 41188 21
nrow(bank_additional_df) # number of rows
## [1] 41188
ncol(bank_additional_df) # number of columns
## [1] 21
# Column names
names(bank_additional_df)
## [1] "age" "job" "marital" "education"
## [5] "default" "housing" "loan" "contact"
## [9] "month" "day_of_week" "duration" "campaign"
## [13] "pdays" "previous" "poutcome" "emp.var.rate"
## [17] "cons.price.idx" "cons.conf.idx" "euribor3m" "nr.employed"
## [21] "y"
# Summary statistics for all variables
summary(bank_additional_df)
## age job marital education
## Min. :17.00 Length:41188 Length:41188 Length:41188
## 1st Qu.:32.00 Class :character Class :character Class :character
## Median :38.00 Mode :character Mode :character Mode :character
## Mean :40.02
## 3rd Qu.:47.00
## Max. :98.00
## default housing loan contact
## Length:41188 Length:41188 Length:41188 Length:41188
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## month day_of_week duration campaign
## Length:41188 Length:41188 Min. : 0.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 102.0 1st Qu.: 1.000
## Mode :character Mode :character Median : 180.0 Median : 2.000
## Mean : 258.3 Mean : 2.568
## 3rd Qu.: 319.0 3rd Qu.: 3.000
## Max. :4918.0 Max. :56.000
## pdays previous poutcome emp.var.rate
## Min. : 0.0 Min. :0.000 Length:41188 Length:41188
## 1st Qu.:999.0 1st Qu.:0.000 Class :character Class :character
## Median :999.0 Median :0.000 Mode :character Mode :character
## Mean :962.5 Mean :0.173
## 3rd Qu.:999.0 3rd Qu.:0.000
## Max. :999.0 Max. :7.000
## cons.price.idx cons.conf.idx euribor3m nr.employed
## Length:41188 Length:41188 Length:41188 Length:41188
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## y
## Length:41188
## Class :character
## Mode :character
##
##
##
# First and last few records
head(bank_additional_df, 10)
## age job marital education default housing loan contact
## 1 56 housemaid married basic.4y no no no telephone
## 2 57 services married high.school unknown no no telephone
## 3 37 services married high.school no yes no telephone
## 4 40 admin. married basic.6y no no no telephone
## 5 56 services married high.school no no yes telephone
## 6 45 services married basic.9y unknown no no telephone
## 7 59 admin. married professional.course no no no telephone
## 8 41 blue-collar married unknown unknown no no telephone
## 9 24 technician single professional.course no yes no telephone
## 10 25 services single high.school no yes no telephone
## month day_of_week duration campaign pdays previous poutcome emp.var.rate
## 1 may mon 261 1 999 0 nonexistent 1.1
## 2 may mon 149 1 999 0 nonexistent 1.1
## 3 may mon 226 1 999 0 nonexistent 1.1
## 4 may mon 151 1 999 0 nonexistent 1.1
## 5 may mon 307 1 999 0 nonexistent 1.1
## 6 may mon 198 1 999 0 nonexistent 1.1
## 7 may mon 139 1 999 0 nonexistent 1.1
## 8 may mon 217 1 999 0 nonexistent 1.1
## 9 may mon 380 1 999 0 nonexistent 1.1
## 10 may mon 50 1 999 0 nonexistent 1.1
## cons.price.idx cons.conf.idx euribor3m nr.employed y
## 1 93.994 -36.4 4.857 5191 no
## 2 93.994 -36.4 4.857 5191 no
## 3 93.994 -36.4 4.857 5191 no
## 4 93.994 -36.4 4.857 5191 no
## 5 93.994 -36.4 4.857 5191 no
## 6 93.994 -36.4 4.857 5191 no
## 7 93.994 -36.4 4.857 5191 no
## 8 93.994 -36.4 4.857 5191 no
## 9 93.994 -36.4 4.857 5191 no
## 10 93.994 -36.4 4.857 5191 no
tail(bank_additional_df, 10)
## age job marital education default housing loan
## 41179 62 retired married university.degree no no no
## 41180 64 retired divorced professional.course no yes no
## 41181 36 admin. married university.degree no no no
## 41182 37 admin. married university.degree no yes no
## 41183 29 unemployed single basic.4y no yes no
## 41184 73 retired married professional.course no yes no
## 41185 46 blue-collar married professional.course no no no
## 41186 56 retired married university.degree no yes no
## 41187 44 technician married professional.course no no no
## 41188 74 retired married professional.course no yes no
## contact month day_of_week duration campaign pdays previous poutcome
## 41179 cellular nov thu 483 2 6 3 success
## 41180 cellular nov fri 151 3 999 0 nonexistent
## 41181 cellular nov fri 254 2 999 0 nonexistent
## 41182 cellular nov fri 281 1 999 0 nonexistent
## 41183 cellular nov fri 112 1 9 1 success
## 41184 cellular nov fri 334 1 999 0 nonexistent
## 41185 cellular nov fri 383 1 999 0 nonexistent
## 41186 cellular nov fri 189 2 999 0 nonexistent
## 41187 cellular nov fri 442 1 999 0 nonexistent
## 41188 cellular nov fri 239 3 999 1 failure
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y
## 41179 -1.1 94.767 -50.8 1.031 4963.6 yes
## 41180 -1.1 94.767 -50.8 1.028 4963.6 no
## 41181 -1.1 94.767 -50.8 1.028 4963.6 no
## 41182 -1.1 94.767 -50.8 1.028 4963.6 yes
## 41183 -1.1 94.767 -50.8 1.028 4963.6 no
## 41184 -1.1 94.767 -50.8 1.028 4963.6 yes
## 41185 -1.1 94.767 -50.8 1.028 4963.6 no
## 41186 -1.1 94.767 -50.8 1.028 4963.6 no
## 41187 -1.1 94.767 -50.8 1.028 4963.6 yes
## 41188 -1.1 94.767 -50.8 1.028 4963.6 no
# Missing values per column
missing_summary <- bank_additional_df %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Missing_Count") %>%
mutate(Missing_Percent = round(Missing_Count / nrow(bank_additional_df) * 100, 2)) %>%
arrange(desc(Missing_Count))
missing_summary
## # A tibble: 21 × 3
## Variable Missing_Count Missing_Percent
## <chr> <int> <dbl>
## 1 age 0 0
## 2 job 0 0
## 3 marital 0 0
## 4 education 0 0
## 5 default 0 0
## 6 housing 0 0
## 7 loan 0 0
## 8 contact 0 0
## 9 month 0 0
## 10 day_of_week 0 0
## # ℹ 11 more rows
# Unique values in categorical variables (factor/character columns)
lapply(bank_additional_df[sapply(bank_additional_df, is.character)], unique)
## $job
## [1] "housemaid" "services" "admin." "blue-collar"
## [5] "technician" "retired" "management" "unemployed"
## [9] "self-employed" "unknown" "entrepreneur" "student"
##
## $marital
## [1] "married" "single" "divorced" "unknown"
##
## $education
## [1] "basic.4y" "high.school" "basic.6y"
## [4] "basic.9y" "professional.course" "unknown"
## [7] "university.degree" "illiterate"
##
## $default
## [1] "no" "unknown" "yes"
##
## $housing
## [1] "no" "yes" "unknown"
##
## $loan
## [1] "no" "yes" "unknown"
##
## $contact
## [1] "telephone" "cellular"
##
## $month
## [1] "may" "jun" "jul" "aug" "oct" "nov" "dec" "mar" "apr" "sep"
##
## $day_of_week
## [1] "mon" "tue" "wed" "thu" "fri"
##
## $poutcome
## [1] "nonexistent" "failure" "success"
##
## $emp.var.rate
## [1] "1.1" "1.4" "-0.1" "-0.2" "-1.8" "-2.9" "-3.4" "-3" "-1.7" "-1.1"
##
## $cons.price.idx
## [1] "93.994" "94.465" "93.918" "93.444" "93.798" "93.2" "92.756" "92.843"
## [9] "93.075" "92.893" "92.963" "92.469" "92.201" "92.379" "92.431" "92.649"
## [17] "92.713" "93.369" "93.749" "93.876" "94.055" "94.215" "94.027" "94.199"
## [25] "94.601" "94.767"
##
## $cons.conf.idx
## [1] "-36.4" "-41.8" "-42.7" "-36.1" "-40.4" "-42" "-45.9" "-50" "-47.1"
## [10] "-46.2" "-40.8" "-33.6" "-31.4" "-29.8" "-26.9" "-30.1" "-33" "-34.8"
## [19] "-34.6" "-40" "-39.8" "-40.3" "-38.3" "-37.5" "-49.5" "-50.8"
##
## $euribor3m
## [1] "4.857" "4.856" "4.855" "4.859" "4.86" "4.858" "4.864" "4.865" "4.866"
## [10] "4.967" "4.961" "4.959" "4.958" "4.96" "4.962" "4.955" "4.947" "4.956"
## [19] "4.966" "4.963" "4.957" "4.968" "4.97" "4.965" "4.964" "5.045" "5"
## [28] "4.936" "4.921" "4.918" "4.912" "4.827" "4.794" "4.76" "4.733" "4.7"
## [37] "4.663" "4.592" "4.474" "4.406" "4.343" "4.286" "4.245" "4.223" "4.191"
## [46] "4.153" "4.12" "4.076" "4.021" "3.901" "3.879" "3.853" "3.816" "3.743"
## [55] "3.669" "3.563" "3.488" "3.428" "3.329" "3.282" "3.053" "1.811" "1.799"
## [64] "1.778" "1.757" "1.726" "1.703" "1.687" "1.663" "1.65" "1.64" "1.629"
## [73] "1.614" "1.602" "1.584" "1.574" "1.56" "1.556" "1.548" "1.538" "1.531"
## [82] "1.52" "1.51" "1.498" "1.483" "1.479" "1.466" "1.453" "1.445" "1.435"
## [91] "1.423" "1.415" "1.41" "1.405" "1.406" "1.4" "1.392" "1.384" "1.372"
## [100] "1.365" "1.354" "1.344" "1.334" "1.327" "1.313" "1.299" "1.291" "1.281"
## [109] "1.266" "1.25" "1.244" "1.259" "1.264" "1.27" "1.262" "1.26" "1.268"
## [118] "1.286" "1.252" "1.235" "1.224" "1.215" "1.206" "1.099" "1.085" "1.072"
## [127] "1.059" "1.048" "1.044" "1.029" "1.018" "1.007" "0.996" "0.979" "0.969"
## [136] "0.944" "0.937" "0.933" "0.927" "0.921" "0.914" "0.908" "0.903" "0.899"
## [145] "0.884" "0.883" "0.881" "0.879" "0.873" "0.869" "0.861" "0.859" "0.854"
## [154] "0.851" "0.849" "0.843" "0.838" "0.834" "0.829" "0.825" "0.821" "0.819"
## [163] "0.813" "0.809" "0.803" "0.797" "0.788" "0.781" "0.778" "0.773" "0.771"
## [172] "0.77" "0.768" "0.766" "0.762" "0.755" "0.749" "0.743" "0.741" "0.739"
## [181] "0.75" "0.753" "0.754" "0.752" "0.744" "0.74" "0.742" "0.737" "0.735"
## [190] "0.733" "0.73" "0.731" "0.728" "0.724" "0.722" "0.72" "0.719" "0.716"
## [199] "0.715" "0.714" "0.718" "0.721" "0.717" "0.712" "0.71" "0.709" "0.708"
## [208] "0.706" "0.707" "0.7" "0.655" "0.654" "0.653" "0.652" "0.651" "0.65"
## [217] "0.649" "0.646" "0.644" "0.643" "0.639" "0.637" "0.635" "0.636" "0.634"
## [226] "0.638" "0.64" "0.642" "0.645" "0.659" "0.663" "0.668" "0.672" "0.677"
## [235] "0.682" "0.683" "0.684" "0.685" "0.688" "0.69" "0.692" "0.695" "0.697"
## [244] "0.699" "0.701" "0.702" "0.704" "0.711" "0.713" "0.723" "0.727" "0.729"
## [253] "0.732" "0.748" "0.761" "0.767" "0.782" "0.79" "0.793" "0.802" "0.81"
## [262] "0.822" "0.827" "0.835" "0.84" "0.846" "0.87" "0.876" "0.885" "0.889"
## [271] "0.893" "0.896" "0.898" "0.9" "0.904" "0.905" "0.895" "0.894" "0.891"
## [280] "0.89" "0.888" "0.886" "0.882" "0.88" "0.878" "0.877" "0.942" "0.953"
## [289] "0.956" "0.959" "0.965" "0.972" "0.977" "0.982" "0.985" "0.987" "0.993"
## [298] "1" "1.008" "1.016" "1.025" "1.032" "1.037" "1.043" "1.045" "1.047"
## [307] "1.05" "1.049" "1.046" "1.041" "1.04" "1.039" "1.035" "1.03" "1.031"
## [316] "1.028"
##
## $nr.employed
## [1] "5191" "5228.1" "5195.8" "5176.3" "5099.1" "5076.2" "5017.5" "5023.5"
## [9] "5008.7" "4991.6" "4963.6"
##
## $y
## [1] "no" "yes"
library(dplyr)
library(tidyr)
# Select only character (categorical) columns
categorical_df <- bank_additional_df %>% select(where(is.character))
# Create readable frequency tables
freq_tables <- lapply(names(categorical_df), function(col) {
categorical_df %>%
count(!!sym(col)) %>% # count occurrences of each unique value
arrange(desc(n)) %>% # sort descending
rename(Value = !!sym(col),
Frequency = n) %>%
mutate(Variable = col) %>% # add column name
select(Variable, Value, Frequency)
})
# Combine all into one data frame
freq_tables_df <- bind_rows(freq_tables)
# View
freq_tables_df
## Variable Value Frequency
## 1 job admin. 10422
## 2 job blue-collar 9254
## 3 job technician 6743
## 4 job services 3969
## 5 job management 2924
## 6 job retired 1720
## 7 job entrepreneur 1456
## 8 job self-employed 1421
## 9 job housemaid 1060
## 10 job unemployed 1014
## 11 job student 875
## 12 job unknown 330
## 13 marital married 24928
## 14 marital single 11568
## 15 marital divorced 4612
## 16 marital unknown 80
## 17 education university.degree 12168
## 18 education high.school 9515
## 19 education basic.9y 6045
## 20 education professional.course 5243
## 21 education basic.4y 4176
## 22 education basic.6y 2292
## 23 education unknown 1731
## 24 education illiterate 18
## 25 default no 32588
## 26 default unknown 8597
## 27 default yes 3
## 28 housing yes 21576
## 29 housing no 18622
## 30 housing unknown 990
## 31 loan no 33950
## 32 loan yes 6248
## 33 loan unknown 990
## 34 contact cellular 26144
## 35 contact telephone 15044
## 36 month may 13769
## 37 month jul 7174
## 38 month aug 6178
## 39 month jun 5318
## 40 month nov 4101
## 41 month apr 2632
## 42 month oct 718
## 43 month sep 570
## 44 month mar 546
## 45 month dec 182
## 46 day_of_week thu 8623
## 47 day_of_week mon 8514
## 48 day_of_week wed 8134
## 49 day_of_week tue 8090
## 50 day_of_week fri 7827
## 51 poutcome nonexistent 35563
## 52 poutcome failure 4252
## 53 poutcome success 1373
## 54 emp.var.rate 1.4 16234
## 55 emp.var.rate -1.8 9184
## 56 emp.var.rate 1.1 7763
## 57 emp.var.rate -0.1 3683
## 58 emp.var.rate -2.9 1663
## 59 emp.var.rate -3.4 1071
## 60 emp.var.rate -1.7 773
## 61 emp.var.rate -1.1 635
## 62 emp.var.rate -3 172
## 63 emp.var.rate -0.2 10
## 64 cons.price.idx 93.994 7763
## 65 cons.price.idx 93.918 6685
## 66 cons.price.idx 92.893 5794
## 67 cons.price.idx 93.444 5175
## 68 cons.price.idx 94.465 4374
## 69 cons.price.idx 93.2 3616
## 70 cons.price.idx 93.075 2458
## 71 cons.price.idx 92.201 770
## 72 cons.price.idx 92.963 715
## 73 cons.price.idx 92.431 447
## 74 cons.price.idx 92.649 357
## 75 cons.price.idx 94.215 311
## 76 cons.price.idx 94.199 303
## 77 cons.price.idx 92.843 282
## 78 cons.price.idx 92.379 267
## 79 cons.price.idx 93.369 264
## 80 cons.price.idx 94.027 233
## 81 cons.price.idx 94.055 229
## 82 cons.price.idx 93.876 212
## 83 cons.price.idx 94.601 204
## 84 cons.price.idx 92.469 178
## 85 cons.price.idx 93.749 174
## 86 cons.price.idx 92.713 172
## 87 cons.price.idx 94.767 128
## 88 cons.price.idx 93.798 67
## 89 cons.price.idx 92.756 10
## 90 cons.conf.idx -36.4 7763
## 91 cons.conf.idx -42.7 6685
## 92 cons.conf.idx -46.2 5794
## 93 cons.conf.idx -36.1 5175
## 94 cons.conf.idx -41.8 4374
## 95 cons.conf.idx -42 3616
## 96 cons.conf.idx -47.1 2458
## 97 cons.conf.idx -31.4 770
## 98 cons.conf.idx -40.8 715
## 99 cons.conf.idx -26.9 447
## 100 cons.conf.idx -30.1 357
## 101 cons.conf.idx -40.3 311
## 102 cons.conf.idx -37.5 303
## 103 cons.conf.idx -50 282
## 104 cons.conf.idx -29.8 267
## 105 cons.conf.idx -34.8 264
## 106 cons.conf.idx -38.3 233
## 107 cons.conf.idx -39.8 229
## 108 cons.conf.idx -40 212
## 109 cons.conf.idx -49.5 204
## 110 cons.conf.idx -33.6 178
## 111 cons.conf.idx -34.6 174
## 112 cons.conf.idx -33 172
## 113 cons.conf.idx -50.8 128
## 114 cons.conf.idx -40.4 67
## 115 cons.conf.idx -45.9 10
## 116 euribor3m 4.857 2868
## 117 euribor3m 4.962 2613
## 118 euribor3m 4.963 2487
## 119 euribor3m 4.961 1902
## 120 euribor3m 4.856 1210
## 121 euribor3m 4.964 1175
## 122 euribor3m 1.405 1169
## 123 euribor3m 4.965 1071
## 124 euribor3m 4.864 1044
## 125 euribor3m 4.96 1013
## 126 euribor3m 4.968 992
## 127 euribor3m 4.959 895
## 128 euribor3m 4.86 892
## 129 euribor3m 4.855 840
## 130 euribor3m 4.076 822
## 131 euribor3m 1.266 820
## 132 euribor3m 4.859 788
## 133 euribor3m 4.12 756
## 134 euribor3m 4.858 733
## 135 euribor3m 4.153 690
## 136 euribor3m 4.021 676
## 137 euribor3m 4.967 643
## 138 euribor3m 1.281 637
## 139 euribor3m 4.966 622
## 140 euribor3m 4.191 610
## 141 euribor3m 1.25 587
## 142 euribor3m 4.958 581
## 143 euribor3m 1.291 544
## 144 euribor3m 1.327 538
## 145 euribor3m 4.957 537
## 146 euribor3m 1.299 520
## 147 euribor3m 1.313 492
## 148 euribor3m 1.334 482
## 149 euribor3m 1.244 422
## 150 euribor3m 1.344 395
## 151 euribor3m 4.865 373
## 152 euribor3m 4.866 340
## 153 euribor3m 1.365 303
## 154 euribor3m 1.41 254
## 155 euribor3m 1.26 252
## 156 euribor3m 1.354 215
## 157 euribor3m 0.879 180
## 158 euribor3m 4.97 172
## 159 euribor3m 1.262 145
## 160 euribor3m 0.714 139
## 161 euribor3m 0.715 135
## 162 euribor3m 0.884 128
## 163 euribor3m 0.883 124
## 164 euribor3m 1.27 110
## 165 euribor3m 1.445 103
## 166 euribor3m 4.955 103
## 167 euribor3m 1.415 98
## 168 euribor3m 4.947 98
## 169 euribor3m 1.268 95
## 170 euribor3m 1.264 87
## 171 euribor3m 1.423 87
## 172 euribor3m 0.739 82
## 173 euribor3m 0.873 82
## 174 euribor3m 1.435 81
## 175 euribor3m 1.453 81
## 176 euribor3m 0.881 79
## 177 euribor3m 0.72 78
## 178 euribor3m 0.722 74
## 179 euribor3m 1.259 70
## 180 euribor3m 0.742 68
## 181 euribor3m 0.861 65
## 182 euribor3m 1.479 62
## 183 euribor3m 0.904 60
## 184 euribor3m 1.466 57
## 185 euribor3m 0.716 54
## 186 euribor3m 0.869 54
## 187 euribor3m 0.899 50
## 188 euribor3m 1.483 50
## 189 euribor3m 0.646 49
## 190 euribor3m 0.886 48
## 191 euribor3m 0.74 45
## 192 euribor3m 0.754 44
## 193 euribor3m 1.029 44
## 194 euribor3m 0.635 43
## 195 euribor3m 0.682 39
## 196 euribor3m 0.898 39
## 197 euribor3m 0.644 38
## 198 euribor3m 0.797 38
## 199 euribor3m 0.896 37
## 200 euribor3m 1.044 37
## 201 euribor3m 0.642 35
## 202 euribor3m 0.652 35
## 203 euribor3m 0.728 35
## 204 euribor3m 0.849 35
## 205 euribor3m 0.859 35
## 206 euribor3m 1.498 35
## 207 euribor3m 0.655 34
## 208 euribor3m 1.072 34
## 209 euribor3m 0.878 33
## 210 euribor3m 0.803 31
## 211 euribor3m 0.876 31
## 212 euribor3m 1.811 31
## 213 euribor3m 0.719 30
## 214 euribor3m 0.854 30
## 215 euribor3m 0.838 29
## 216 euribor3m 1.531 29
## 217 euribor3m 0.699 28
## 218 euribor3m 0.741 27
## 219 euribor3m 0.825 27
## 220 euribor3m 0.851 27
## 221 euribor3m 0.9 27
## 222 euribor3m 0.645 26
## 223 euribor3m 0.707 26
## 224 euribor3m 1.252 26
## 225 euribor3m 0.737 25
## 226 euribor3m 0.882 25
## 227 euribor3m 1.406 25
## 228 euribor3m 0.73 24
## 229 euribor3m 0.821 24
## 230 euribor3m 0.827 24
## 231 euribor3m 0.643 23
## 232 euribor3m 0.697 23
## 233 euribor3m 0.724 23
## 234 euribor3m 1.059 23
## 235 euribor3m 4.956 23
## 236 euribor3m 0.702 22
## 237 euribor3m 0.761 22
## 238 euribor3m 0.773 22
## 239 euribor3m 0.819 22
## 240 euribor3m 1.048 22
## 241 euribor3m 1.687 22
## 242 euribor3m 0.735 21
## 243 euribor3m 0.781 21
## 244 euribor3m 0.809 21
## 245 euribor3m 0.846 21
## 246 euribor3m 0.977 21
## 247 euribor3m 1.05 21
## 248 euribor3m 1.392 21
## 249 euribor3m 0.654 20
## 250 euribor3m 0.77 20
## 251 euribor3m 0.788 20
## 252 euribor3m 0.835 20
## 253 euribor3m 0.877 20
## 254 euribor3m 0.88 20
## 255 euribor3m 1.215 20
## 256 euribor3m 1.663 20
## 257 euribor3m 1.757 20
## 258 euribor3m 0.653 19
## 259 euribor3m 0.81 19
## 260 euribor3m 0.987 19
## 261 euribor3m 0.668 18
## 262 euribor3m 0.706 18
## 263 euribor3m 0.717 18
## 264 euribor3m 0.718 18
## 265 euribor3m 0.733 18
## 266 euribor3m 0.84 18
## 267 euribor3m 1 18
## 268 euribor3m 0.743 17
## 269 euribor3m 0.744 17
## 270 euribor3m 0.767 17
## 271 euribor3m 0.889 17
## 272 euribor3m 0.905 17
## 273 euribor3m 0.972 17
## 274 euribor3m 1.52 17
## 275 euribor3m 1.538 17
## 276 euribor3m 0.639 16
## 277 euribor3m 0.672 16
## 278 euribor3m 0.684 16
## 279 euribor3m 0.843 16
## 280 euribor3m 0.908 16
## 281 euribor3m 0.959 16
## 282 euribor3m 1.032 16
## 283 euribor3m 1.286 16
## 284 euribor3m 0.659 15
## 285 euribor3m 0.731 15
## 286 euribor3m 0.982 15
## 287 euribor3m 1.046 15
## 288 euribor3m 0.636 14
## 289 euribor3m 0.683 14
## 290 euribor3m 1.025 14
## 291 euribor3m 1.799 14
## 292 euribor3m 0.729 13
## 293 euribor3m 0.768 13
## 294 euribor3m 0.829 13
## 295 euribor3m 0.834 13
## 296 euribor3m 0.87 13
## 297 euribor3m 0.893 13
## 298 euribor3m 1.049 13
## 299 euribor3m 1.4 13
## 300 euribor3m 1.614 13
## 301 euribor3m 0.65 12
## 302 euribor3m 0.677 12
## 303 euribor3m 0.748 12
## 304 euribor3m 0.903 12
## 305 euribor3m 1.556 12
## 306 euribor3m 0.663 11
## 307 euribor3m 0.701 11
## 308 euribor3m 0.782 11
## 309 euribor3m 0.79 11
## 310 euribor3m 0.822 11
## 311 euribor3m 1.099 11
## 312 euribor3m 1.51 11
## 313 euribor3m 1.726 11
## 314 euribor3m 0.64 10
## 315 euribor3m 0.649 10
## 316 euribor3m 0.692 10
## 317 euribor3m 0.695 10
## 318 euribor3m 0.712 10
## 319 euribor3m 0.723 10
## 320 euribor3m 0.885 10
## 321 euribor3m 1.04 10
## 322 euribor3m 1.372 10
## 323 euribor3m 1.629 10
## 324 euribor3m 1.64 10
## 325 euribor3m 0.685 9
## 326 euribor3m 0.71 9
## 327 euribor3m 0.713 9
## 328 euribor3m 0.793 9
## 329 euribor3m 1.016 9
## 330 euribor3m 1.028 9
## 331 euribor3m 1.039 9
## 332 euribor3m 1.041 9
## 333 euribor3m 1.043 9
## 334 euribor3m 1.206 9
## 335 euribor3m 1.235 9
## 336 euribor3m 1.384 9
## 337 euribor3m 4.245 9
## 338 euribor3m 4.663 9
## 339 euribor3m 5.045 9
## 340 euribor3m 0.634 8
## 341 euribor3m 0.709 8
## 342 euribor3m 0.89 8
## 343 euribor3m 1.031 8
## 344 euribor3m 1.56 8
## 345 euribor3m 1.602 8
## 346 euribor3m 1.65 8
## 347 euribor3m 1.703 8
## 348 euribor3m 4.7 8
## 349 euribor3m 0.638 7
## 350 euribor3m 0.651 7
## 351 euribor3m 0.704 7
## 352 euribor3m 0.75 7
## 353 euribor3m 0.753 7
## 354 euribor3m 0.755 7
## 355 euribor3m 0.778 7
## 356 euribor3m 0.802 7
## 357 euribor3m 0.942 7
## 358 euribor3m 0.985 7
## 359 euribor3m 1.035 7
## 360 euribor3m 1.085 7
## 361 euribor3m 1.224 7
## 362 euribor3m 4.286 7
## 363 euribor3m 4.406 7
## 364 euribor3m 4.912 7
## 365 euribor3m 5 7
## 366 euribor3m 0.637 6
## 367 euribor3m 0.708 6
## 368 euribor3m 0.721 6
## 369 euribor3m 0.732 6
## 370 euribor3m 0.771 6
## 371 euribor3m 0.813 6
## 372 euribor3m 1.03 6
## 373 euribor3m 1.037 6
## 374 euribor3m 1.548 6
## 375 euribor3m 4.936 6
## 376 euribor3m 0.7 5
## 377 euribor3m 0.727 5
## 378 euribor3m 0.752 5
## 379 euribor3m 0.888 5
## 380 euribor3m 0.965 5
## 381 euribor3m 0.993 5
## 382 euribor3m 1.008 5
## 383 euribor3m 4.343 5
## 384 euribor3m 4.794 5
## 385 euribor3m 4.827 5
## 386 euribor3m 0.711 4
## 387 euribor3m 0.762 4
## 388 euribor3m 0.891 4
## 389 euribor3m 4.223 4
## 390 euribor3m 4.592 4
## 391 euribor3m 4.918 4
## 392 euribor3m 0.688 3
## 393 euribor3m 0.69 3
## 394 euribor3m 0.766 3
## 395 euribor3m 0.894 3
## 396 euribor3m 0.895 3
## 397 euribor3m 0.914 3
## 398 euribor3m 0.944 3
## 399 euribor3m 0.979 3
## 400 euribor3m 1.007 3
## 401 euribor3m 1.018 3
## 402 euribor3m 1.584 3
## 403 euribor3m 1.778 3
## 404 euribor3m 4.474 3
## 405 euribor3m 4.76 3
## 406 euribor3m 4.921 3
## 407 euribor3m 0.749 2
## 408 euribor3m 0.921 2
## 409 euribor3m 0.927 2
## 410 euribor3m 0.937 2
## 411 euribor3m 0.953 2
## 412 euribor3m 3.563 2
## 413 euribor3m 3.879 2
## 414 euribor3m 4.733 2
## 415 euribor3m 0.933 1
## 416 euribor3m 0.956 1
## 417 euribor3m 0.969 1
## 418 euribor3m 0.996 1
## 419 euribor3m 1.045 1
## 420 euribor3m 1.047 1
## 421 euribor3m 1.574 1
## 422 euribor3m 3.053 1
## 423 euribor3m 3.282 1
## 424 euribor3m 3.329 1
## 425 euribor3m 3.428 1
## 426 euribor3m 3.488 1
## 427 euribor3m 3.669 1
## 428 euribor3m 3.743 1
## 429 euribor3m 3.816 1
## 430 euribor3m 3.853 1
## 431 euribor3m 3.901 1
## 432 nr.employed 5228.1 16234
## 433 nr.employed 5099.1 8534
## 434 nr.employed 5191 7763
## 435 nr.employed 5195.8 3683
## 436 nr.employed 5076.2 1663
## 437 nr.employed 5017.5 1071
## 438 nr.employed 4991.6 773
## 439 nr.employed 5008.7 650
## 440 nr.employed 4963.6 635
## 441 nr.employed 5023.5 172
## 442 nr.employed 5176.3 10
## 443 y no 36548
## 444 y yes 4640
# Quick numeric summaries (mean, sd, min, max, quantiles)
sapply(bank_additional_df[sapply(bank_additional_df, is.numeric)], function(x) {
c(mean = mean(x, na.rm = TRUE),
sd = sd(x, na.rm = TRUE),
min = min(x, na.rm = TRUE),
q25 = quantile(x, 0.25, na.rm = TRUE),
median = median(x, na.rm = TRUE),
q75 = quantile(x, 0.75, na.rm = TRUE),
max = max(x, na.rm = TRUE))
})
## age duration campaign pdays previous
## mean 40.02406 258.2850 2.567593 962.4755 0.1729630
## sd 10.42125 259.2792 2.770014 186.9109 0.4949011
## min 17.00000 0.0000 1.000000 0.0000 0.0000000
## q25.25% 32.00000 102.0000 1.000000 999.0000 0.0000000
## median 38.00000 180.0000 2.000000 999.0000 0.0000000
## q75.75% 47.00000 319.0000 3.000000 999.0000 0.0000000
## max 98.00000 4918.0000 56.000000 999.0000 7.0000000
# Central tendency & spread (numerical variables)
numeric_vars <- bank_additional_df %>% select(where(is.numeric))
summary(numeric_vars)
## age duration campaign pdays
## Min. :17.00 Min. : 0.0 Min. : 1.000 Min. : 0.0
## 1st Qu.:32.00 1st Qu.: 102.0 1st Qu.: 1.000 1st Qu.:999.0
## Median :38.00 Median : 180.0 Median : 2.000 Median :999.0
## Mean :40.02 Mean : 258.3 Mean : 2.568 Mean :962.5
## 3rd Qu.:47.00 3rd Qu.: 319.0 3rd Qu.: 3.000 3rd Qu.:999.0
## Max. :98.00 Max. :4918.0 Max. :56.000 Max. :999.0
## previous
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.173
## 3rd Qu.:0.000
## Max. :7.000
# Boxplots to check central tendency and outliers
numeric_vars_long <- numeric_vars %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")
ggplot(numeric_vars_long, aes(x = Variable, y = Value)) +
geom_boxplot(fill = "skyblue") +
theme_minimal() +
ggtitle("Boxplots of Numeric Variables (Central Tendency & Outliers)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Overall distribution of each variable
# Histograms for numeric variables
ggplot(numeric_vars_long, aes(x = Value)) +
geom_histogram(fill = "lightgreen", color = "black", bins = 30) +
facet_wrap(~ Variable, scales = "free") +
theme_minimal() +
ggtitle("Histograms of Numeric Variables")
# Patterns or trends in data
# Example: average duration by job
bank_additional_df %>%
group_by(job) %>%
summarise(Average_Duration = mean(duration, na.rm = TRUE)) %>%
ggplot(aes(x = reorder(job, Average_Duration), y = Average_Duration)) +
geom_bar(stat = "identity", fill = "purple") +
coord_flip() +
theme_minimal() +
ggtitle("Average Call Duration by Job")
# Correlation between numeric variables
corr_matrix <- cor(numeric_vars, use = "complete.obs")
corrplot(corr_matrix, method = "color", type = "upper", tl.cex = 0.8, addCoef.col = "black")
# Correlation matrix for numeric columns
numeric_vars <- bank_additional_df[sapply(bank_additional_df, is.numeric)]
cor(numeric_vars, use = "complete.obs")
## age duration campaign pdays previous
## age 1.000000000 -0.000865705 0.00459358 -0.03436895 0.02436474
## duration -0.000865705 1.000000000 -0.07169923 -0.04757702 0.02064035
## campaign 0.004593580 -0.071699226 1.00000000 0.05258357 -0.07914147
## pdays -0.034368951 -0.047577015 0.05258357 1.00000000 -0.58751386
## previous 0.024364741 0.020640351 -0.07914147 -0.58751386 1.00000000
age & duration (-0.00087): Essentially zero correlation; age of client has no linear relationship with call duration. age & campaign (0.0046): Nearly zero; older clients are not contacted more or less often. age & pdays (-0.034): Very weak negative correlation; older clients slightly more likely to have been contacted recently in previous campaigns, but effect is negligible. age & previous (0.024): Essentially no correlation; age does not relate to prior contacts. duration & campaign (-0.072): Very weak negative correlation; longer calls slightly associated with fewer calls in this campaign. duration & pdays (-0.048): Very weak negative correlation; call duration not meaningfully related to days since last contact. duration & previous (0.021): Almost zero; prior contacts do not affect call length. campaign & pdays (0.053): Very weak positive correlation; number of contacts in this campaign is slightly higher for clients contacted longer ago. campaign & previous (-0.079): Very weak negative correlation; more prior contacts slightly associated with fewer contacts in this campaign. pdays & previous (-0.588): Moderate to strong negative correlation; as days since last contact (pdays) increases, the number of prior contacts decreases. Makes sense: if someone was contacted long ago, there were fewer previous contacts.
Key takeaways:
Most variables have very weak correlations (close to 0), meaning they are largely independent.
The only strong relationship is between pdays and previous (-0.588), which is meaningful for modeling.
Variables like age, duration, and campaign are not strongly correlated with each other, so multicollinearity is unlikely among them.
# Relationships between different variables
# Scatterplot matrix for numeric variables
pairs(numeric_vars, main = "Scatterplot Matrix of Numeric Variables", pch = 19, col = "blue")
# Or using ggplot for two selected variables
ggplot(bank_additional_df, aes(x = pdays, y = previous)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", col = "red") +
theme_minimal() +
ggtitle("Relationship: pdays vs previous")
## `geom_smooth()` using formula = 'y ~ x'
# Scatter plots for numeric variables (interactive)
# Example: campaign vs duration
plot_ly(bank_additional_df,
x = ~campaign,
y = ~duration,
type = 'scatter',
mode = 'markers',
color = ~job, # optional: color by a categorical variable
text = ~paste("Age:", age, "<br>Previous:", previous),
marker = list(size = 10, opacity = 0.7)) %>%
layout(title = "Relationship: Campaign vs Duration",
xaxis = list(title = "Number of Contacts in Campaign"),
yaxis = list(title = "Call Duration (seconds)"))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
cor_matrix <- round(cor(numeric_vars, use = "complete.obs"), 2)
plot_cor <- plot_ly(x = colnames(cor_matrix), y = rownames(cor_matrix), z = cor_matrix,
type = "heatmap", colorscale = "Viridis") %>%
layout(title = "Correlation Heatmap of Numeric Variables")
plot_cor
missing_summary <- bank_additional_df %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "Variable", values_to = "Missing_Count")
plot_missing <- plot_ly(missing_summary, x = ~Variable, y = ~Missing_Count, type = "bar", color = ~Variable) %>%
layout(title = "Missing Values per Column",
xaxis = list(title = "Variable"),
yaxis = list(title = "Missing Count"))
plot_missing
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
plot_box <- plot_ly(bank_additional_df,
x = ~job,
y = ~duration,
type = 'box',
color = ~job,
boxpoints = 'all',
jitter = 0.3,
pointpos = -1.8) %>%
layout(title = "Call Duration by Job",
xaxis = list(title = "Job"),
yaxis = list(title = "Duration"))
plot_box
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
numeric_vars <- bank_additional_df %>% select(where(is.numeric))
numeric_long <- numeric_vars %>% pivot_longer(everything(), names_to = "Variable", values_to = "Value")
plot_numeric_hist <- plot_ly(numeric_long,
x = ~Value,
type = 'histogram',
color = ~Variable,
nbinsx = 30) %>%
layout(barmode = "overlay",
title = "Distribution of Numeric Variables",
xaxis = list(title = "Value"),
yaxis = list(title = "Count"))
plot_numeric_hist
# Relationships between numeric variables (scatter matrix)
plot_splom <- plot_ly(type = 'splom',
dimensions = list(
list(label = 'Age', values = numeric_vars$age),
list(label = 'Duration', values = numeric_vars$duration),
list(label = 'Campaign', values = numeric_vars$campaign),
list(label = 'Pdays', values = numeric_vars$pdays),
list(label = 'Previous', values = numeric_vars$previous)
),
marker = list(color = 'rgba(0, 100, 200, 0.5)', size = 5)) %>%
layout(title = 'Scatterplot Matrix of Numeric Variables')
plot_splom